import re
import jieba
import jieba.posseg as pseg
from py2neo import Graph, Node, Relationship, NodeMatcher

# Connection to the local Neo4j instance that stores the knowledge graph.
# NOTE(review): the credentials are hard-coded in the source file.
qut_graph = Graph('http://localhost:7474', username='neo4j', password='00001011')

matcher = NodeMatcher(qut_graph)

# One label match per node type that is collected into `entities` below.
university_nodes = matcher.match('学校')
school_nodes = matcher.match('学院')
campus_nodes = matcher.match('校区')
org_nodes = matcher.match('组织机构')

# Gather the nodes of all four labels into a single set.
entities = {
    *university_nodes,
    *school_nodes,
    *campus_nodes,
    *org_nodes,
}

# Person nodes are kept as the match object returned by the matcher
# (not copied into a collection here).
person_nodes = matcher.match('人')

# Phrases that refer to the university itself.
_university_aliases = dict.fromkeys(('我校', '学校'), '青岛理工大学')

# Abbreviated school names mapped to the full school names.
_school_abbreviations = {
    '土木学院': '土木工程学院',
    '机械学院': '机械工程学院',
    '汽车学院': '汽车与交通学院',
    '信控学院': '信息与控制工程学院',
    '计算机学院': '计算机工程学院',
    '自动化学院': '自动化工程学院',
    '通信学院': '通信工程学院',
    '管理学院': '管理工程学院',
    '建筑学院': '建筑与城乡规划学院',
    '人文学院': '人文与社会科学学院',
    '高职学院': '高等职业学院',
    '艺术学院': '艺术与设计学院',
    '环境学院': '环境与市政工程学院',
}

# Lookup table from an alias to the canonical name it stands for.
coreferences = {**_university_aliases, **_school_abbreviations}

# Extract "<entity> <verb> <activity>" facts from the scraped news file and
# store them in the graph.
#
# Each line of the input is treated as one article: the text before "来源："
# is the title, a YYYY-MM-DD date appears somewhere on the line, and the
# article body follows "发布人：<lowercase id>".  When the first title token
# names a known entity and the second token is tagged as a verb, an (活动)
# node is created and linked to the entity and to every known person whose
# name occurs in the body.

# Compiled once instead of on every line.
_SOURCE_TAIL_RE = re.compile(r'来源：.*$')
_DATE_RE = re.compile(r'\d{4}-\d{2}-\d{2}')
_HEADER_RE = re.compile(r'^.*发布人：[a-z]*')

# Index the entity nodes by name; the first node seen for a name wins, which
# matches a "first match" linear scan over `entities`.
entity_by_name = {}
for entity in entities:
    entity_by_name.setdefault(entity['name'], entity)

# Index the person nodes by name exactly once.  `person_nodes` comes from
# `matcher.match(...)`, which py2neo evaluates lazily, so looping over it for
# every token of every article would re-run the query each time.  Several
# nodes may share a name, hence the list values.
persons_by_name = {}
for person_node in person_nodes:
    persons_by_name.setdefault(person_node['name'], []).append(person_node)

# NOTE(review): no explicit encoding is passed, so the platform default is
# used to decode the Chinese text — confirm the file's encoding and pin it.
with open('scraper/www_qtech_edu_cn/content.txt', 'r') as file:
    jieba.load_userdict('dict.txt')
    count = 0

    for raw_line in file:
        # Drop the line terminator so it does not leak into the stored
        # activity name or article text.
        article = raw_line.rstrip('\n')

        date_match = _DATE_RE.search(article)
        if date_match is None:
            # Blank or malformed line: skip it instead of aborting the run.
            continue
        date = date_match.group()

        title = _SOURCE_TAIL_RE.sub('', article)
        content = _HEADER_RE.sub('', article)

        title_words = list(pseg.cut(title))
        if len(title_words) < 2:
            # A subject token and a verb token are both required.
            continue

        name, _ = title_words[0]
        verb, pos = title_words[1]
        if pos != 'v':
            continue

        # Coreference resolution: map an alias to its canonical name.
        name = coreferences.get(name, name)

        name_node = entity_by_name.get(name)
        if name_node is None:
            continue

        # Everything after the first occurrence of the verb is the activity.
        # A plain string split is used so the verb is never interpreted as a
        # regular expression and later repetitions do not truncate the text.
        obj = title.split(verb, 1)[1]

        # Distinct known person names mentioned in the article body.
        people = {
            word for word, _ in pseg.cut(content) if word in persons_by_name
        }

        print('命中：', date, name, verb, obj, people)
        obj_info = {
            'name': obj,
            'date': date,
            'origin': content,
        }
        obj_node = Node('活动', **obj_info)
        print('创建 (活动) 节点')
        qut_graph.create(Relationship(name_node, verb, obj_node))
        print('创建关系 (主语)-[举办动词]->(活动)')

        for person in people:
            for person_node in persons_by_name[person]:
                qut_graph.create(Relationship(person_node, '参加', obj_node))
                print('创建关系 (人)-[参加]->(活动)')

        count += 1

    print('抽取信息共计(条)：%d' % count)

